import scrapy

from ScrapyObject.spiders.utils.url_utils import *

'''
The site can be opened, but it appears to use anti-scraping measures, so it cannot be crawled.
Mobile
scrapy crawl langyoufour -o langyoufour.json
https://17g2a.com/
'''


class LangyoufourSpider(scrapy.Spider):
    """Spider that starts at https://17g2a.com/ and yields video items.

    Each response is checked for two things:

    * a direct video URL found in the raw page content, and
    * a listing cell holding a poster image, a title, a play-page link
      and a tag line.

    Every yielded item gets a sequential id taken from the ``i`` counter.
    Links discovered on the page are only logged; following them is
    currently disabled (see the note at the end of ``parse``).
    """

    # URL scheme prefix.
    prefix = 'https://'
    # Site name (middle part of the host name).
    website = '17g2a'
    # Top-level domain plus the trailing slash.
    suffix = '.com/'
    name = 'langyoufour'
    # Derived from ``suffix`` so the domain is spelled in one place only.
    allowed_domains = [website + suffix.rstrip('/')]
    start_urls = [prefix + website + suffix]

    def __init__(self, *args, **kwargs):
        """Initialise the spider and reset the item counter.

        Arguments are forwarded to ``scrapy.Spider`` so that spider
        arguments (``scrapy crawl langyoufour -a key=value``) keep working.
        """
        super(LangyoufourSpider, self).__init__(*args, **kwargs)
        # Sequential id handed to every yielded item.
        self.i = 0

    @staticmethod
    def _first(values, default=''):
        """Return the first element of ``values``, or ``default`` if empty."""
        return values[0] if values else default

    def parse(self, response):
        """Yield video items found on ``response`` and log discovered links.

        Args:
            response: the Scrapy response for a page of the site.

        Yields:
            Items built by ``get_video_item``: one when a direct video URL
            is found in the page content, and one when a listing cell with
            a poster image and a play-page link is present.
        """
        base_url = self.prefix + self.website + self.suffix

        # Page content as a string (per the project helper).
        content = get_data(response)

        # Direct video URL embedded in the page content.
        video_url = get_video_url_three(content)
        if video_url:
            self.i += 1
            yield get_video_item(id=self.i, tags='', url=response.url, name='', pUrl="", vUrl=format_url_one(video_url[0]))

        # Listing cell: poster image, title, play-page link and tag line.
        p_url = response.xpath("//div[@class='videocontentcell titletablegreen6']//img/@ src").extract()
        if p_url:
            name = response.xpath("//div[@class='videocontentcell titletablegreen6']//img/@ title").extract()
            url = response.xpath("//a[@title='HTML5(MP4)播放']/@ href").extract()
            tag_texts = response.xpath("//div[@class='cell3 colorlmtop']/text()").extract()
            if not url:
                # Without the play-page link there is no URL to build the
                # item from; skip it instead of raising IndexError and
                # aborting the rest of the callback.
                self.logger.warning("No HTML5 play link found on %s; skipping listing item", response.url)
            else:
                # Keep only the text before the first non-breaking space.
                tags = self._first(tag_texts).strip().partition('\xa0')[0]
                self.i += 1
                yield get_video_item(id=self.i, tags=tags, url=split_joint(base_url, url[0]), name=self._first(name), pUrl=p_url[0], vUrl='')

        # Links extracted from the page content are only reported for now.
        for link in get_url(content):
            self.logger.debug("%s", link)
            # NOTE: following links is disabled. Re-enable with:
            # if link.startswith('/') and link.endswith(('.html', '/')):
            #     yield scrapy.Request(split_joint(base_url, link), callback=self.parse)